from paddleocr import PaddleOCR, draw_ocr


# Value forwarded to PaddleOCR as its `page_num` option.
# NOTE(review): presumably limits OCR to the first PAGE_NUM pages of a PDF
# input -- confirm against the PaddleOCR documentation.
PAGE_NUM = 10

# Module-level OCR engine, created once at import time and used by ocr_pdf().
# NOTE(review): use_angle_cls=True presumably enables the text-angle
# classifier that the `cls=True` argument relies on at call time -- confirm.
ocr = PaddleOCR(use_angle_cls=True, page_num=PAGE_NUM)

# result = ocr.ocr(pdf_path, cls=True)
# for idx in range(len(result)):
#     res = result[idx]
#     if res is None:  # Skip empty pages to avoid a TypeError when iterating over None
#         print(f"[DEBUG] Empty page {idx+1} detected, skip it.")
#         continue
#     for line in res:
#         print(line)


def ocr_pdf(pdffile) -> str:
    """Run OCR on a PDF and return the recognized text as one string.

    Each recognized text line is emitted followed by a newline. A page for
    which the OCR engine reports no result (``None``) contributes a blank
    separator ("\\n\\n") instead of raising ``TypeError``.

    Args:
        pdffile: The PDF input handed unchanged to ``ocr.ocr`` (presumably a
            file path -- confirm against the caller).

    Returns:
        The concatenated text of all pages, or an empty string when the OCR
        engine returns nothing at all.
    """
    result = ocr.ocr(pdffile, cls=True)
    if not result:
        return ""

    # Shape of `result` (one list per page, one entry per detected line):
    # [[[[[123.0, 130.0], [330.0, 133.0], [329.0, 169.0], [122.0, 165.0]],
    #    ('测试 paddle ocr .', 0.9841340780258179)]]]
    parts = []
    for page in result:
        # The None check must happen before iterating the page; iterating
        # None is exactly the TypeError this guard is meant to prevent.
        if page is None:
            parts.append("\n\n")
            continue
        # Each item is (bounding_box, (text, confidence)); keep the text only.
        parts.extend(result_item[1][0] + "\n" for result_item in page)
    return "".join(parts)


